

* Set the working directory. capture suppresses the error when this path does
* not exist; in that case the relative data\ paths below resolve against
* whatever the current directory is.
capture cd D:\Dropbox\book_welfare\replication\



 * foreach xx in a b c d e f g h i j k l m n o p q r s t u v w x y z  { 
 * NOTE(review): the full a-z list above is commented out; only pages y and z
 * are downloaded and processed by the loop below.
 foreach xx in    y z  { 

	* Read one pseudonym index page as delimited text. Only v1 is used below.
	* NOTE(review): presumably each row of v1 holds one line of the HTML page - confirm.
	import delimited "https://www.trussel.com/books/pseud_`xx'.htm", clear encoding(UTF-8)

	* Remove double quotes (char 34) so that tag attributes can be matched unquoted.
	replace v1=subinstr(v1,char(34),"",.)
	* dnew is 1 on rows containing a PseudoName or a RealName span tag; these rows open a new entry.
	gen dnew=index(v1,"<span class=PseudoName>") + index(v1,"<span class=RealName>")> 0

	* The row following a row that contains Pseudo is taken as the pen name.
	* index is case sensitive, so the lower-case pseudos class does not match here.
	gen penname = v1 if index(v1[_n-1],"Pseudo")>0

	* Rows containing mainlinks are set aside as a fallback source for the real name.
	gen xrealname=v1 if index(v1,"mainlinks")>0
	* The row following a row that contains RealName is taken as the real name.
	gen realname = v1 if index(v1[_n-1],"RealName")>0 


	* Tokenise the mainlinks rows on the characters > and <.
	split xrealname, parse(">" "<")

	* Use the 7th token as the real name when none was found above.
	* NOTE(review): position 7 depends on the page markup; this line errors if fewer than 7 tokens exist - confirm.
	replace realname=xrealname7 if realname==""

	keep v1 penname realname dnew 

	* Strip closing span tags left over in the extracted names.
	replace penname = subinstr(penname,"</span>","",.)
	replace realname = subinstr(realname,"</span>","",.)


	* Split each row at the pseudos span tag; v12, when created, is the text after that tag.
	split v1, parse("<span class=pseudos>")

	* The next five commands are wrapped in capture so that a page where v12
	* was not created (and hence xpenn does not exist) does not stop the script.
	* Square brackets and closing span tags are removed from the list of pen names.
	capture rename v12 xpenn 
	capture replace xpenn = subinstr(xpenn,"[","",.)
	capture replace xpenn = subinstr(xpenn,"]","",.)
	capture replace xpenn = subinstr(xpenn,"</span>","",.)
	* NOTE(review): the pieces created by this split are not in the keep list further down, so they appear to be unused.
	capture split xpenn, parse(",")

	* Drop the split pieces with a one-character suffix (v11, v13, ...); v1 itself does not match v1? and stays.
	drop v1?

	* count is a running entry id: it increases by one at every dnew row.
	gen count=sum(dnew)
	* Within each entry, descending string order puts non-empty values first;
	* the replace then copies them down to the remaining rows of the same entry.
	gsort count -penname 
	replace penname=penname[_n-1] if penname=="" & count==count[_n-1]

	gsort count -realname 
	replace realname=realname[_n-1] if realname=="" & count==count[_n-1]

	* Same fill-down for xpenn, captured because xpenn may not exist.
	capture gsort count -xpenn  
	capture replace xpenn=xpenn[_n-1] if xpenn=="" & count==count[_n-1]

	capture  keep realname penname count  xpenn v1
	* The text following "</span> (" in v1 becomes the variable years.
	* NOTE(review): if no row contains that separator, v12 and hence years are not created and the reshape below fails.
	split v1, parse("</span> (")
	capture rename v12 years
	 drop v1*

	duplicates drop 
	* Keep the pen name found after the Pseudo tag as pen_old and let the
	* list taken from the pseudos span (xpenn) become penname.
	rename penname pen_old 
	capture  rename xpenn penname 


	* One variable per listed pen name (penname1, penname2, ...).
	split  penname, parse(", ")

	* When the list is empty, fall back to the pen name found after the Pseudo tag.
	replace penname1 = pen_old if penname1=="" & pen_old~=""

	drop penname 
	duplicates drop 
	* Long format: one row per entry and pen name slot.
	reshape long penname, i(realname count years) j(num)
	 
	* Names containing ", " are rebuilt as second piece, space, first piece
	* (presumably Last, First becomes First Last). The replace is captured
	* because no second piece exists when no name contains the separator.
	split realname, parse(", ")
	capture replace realname = realname2 + " " + realname1 if realname2~=""

	split penname, parse(", ")
	capture replace penname = penname2 + " " + penname1 if penname2~=""


	keep realname penname count years 

	duplicates drop 
	* Remove parentheses from the real name.
	replace realname=subinstr(realname,"(","",.)
	replace realname=subinstr(realname,")","",.)

	* Empty pen name slots created by the reshape are dropped.
	drop if penname==""

	* Fill empty years from another row of the same real name (non-empty years sort first).
	gsort realname -years 
	replace years=years[_n-1] if years=="" & realname==realname[_n-1]
	duplicates drop 

* Save the cleaned result for this letter under data\ as temp_ plus the letter.
save  data\temp_`xx'.dta, replace 

}


* Stack the per-letter files into one dataset, strip leftover punctuation,
* derive numeric birth and death years, and extract the first token of each
* pen name and real name for the gender-share merges that follow.
*
* The per-letter files are read from data\ for every letter. This is the
* folder the scraping loop saves them to and the folder temp_a is read from.
* The append previously pointed at data\intermediate\, so freshly scraped
* letters were never picked up (or the append failed when that folder was
* absent).
use data\temp_a.dta, clear

foreach x in b c d e f g h i j k l m n o p q r s t u v w x y z {
	append using data\temp_`x'.dta
}

drop if realname==""

* Remove square brackets and parentheses from pen names.
replace penname=subinstr(penname,"[","",.)
replace penname=subinstr(penname,"]","",.)
replace penname=subinstr(penname,"(","",.)
replace penname=subinstr(penname,")","",.)

* Remove parentheses from real names.
replace realname=subinstr(realname,"(","",.)
replace realname=subinstr(realname,")","",.)

keep penname realname years
duplicates drop

* years is split at the dash after its closing parenthesis is removed: the
* first piece is read as the birth year and the second as the death year.
* real() returns missing for pieces that are not purely numeric.
replace years=subinstr(years,")","",.)
split years, p("-")
gen byear=real(years1)
gen dyear=real(years2)

* penname1 and realname1 hold the first space-delimited token of each name;
* they are the keys for the name-gender merges.
split penname, parse(" ")
split realname, parse(" ")

keep penname realname penname1 realname1 byear dyear


			* Attach the male share (mshare) of the first token of the pen name and
			* of the real name, then collapse to one row per pen name and year pair
			* holding the mean male share of the associated real names.
			* NOTE(review): name_gender_wipo.dta is assumed to hold one row per
			* upper-case name with variable mshare - confirm.

			* --- male share of the pen-name first token ---
			* keep(master match) is equivalent to dropping _merge==2 afterwards;
			* keepusing(mshare) stops other using-file variables from being carried
			* into the second merge.
			gen name = upper(penname1)
			merge m:1 name using data\name_gender_wipo.dta, keep(master match) keepusing(mshare) nogenerate
			rename mshare pmshare
			drop name

			* --- male share of the real-name first token ---
			gen name = upper(realname1)
			merge m:1 name using data\name_gender_wipo.dta, keep(master match) keepusing(mshare) nogenerate
			rename mshare rmshare
			drop name

			* Indicators: 1 = male share of at least one half, 0 = below, 2 = name not matched.
			* They are used only for the interactive summary below and are not saved.
			gen realman = cond(missing(rmshare), 2, rmshare>=.5)
			gen penman  = cond(missing(pmshare), 2, pmshare>=.5)

			duplicates drop
			* distinct is a user-written command and must be installed.
			distinct

			* Number the rows within each pen name in a reproducible order. Sorting
			* on realname, byear and dyear inside bysort removes any dependence on
			* the arbitrary order of tied observations.
			bysort penname (realname byear dyear): gen n = _n
			keep penname realname rmshare n byear dyear
			reshape wide realname rmshare, i(penname byear dyear) j(n)

			* Row mean over ALL reshaped share columns. The previous pattern
			* rmshare? matched single-character suffixes only, so rmshare10 and
			* above were left out for pen names with ten or more rows. rowmean is
			* the documented name of the legacy rmean function. The mean is built
			* under a temporary name so the wildcard cannot match the result.
			egen rmshare_mean = rowmean(rmshare*)

			keep penname rmshare_mean byear dyear
			rename rmshare_mean rmshare

		save data\pennames_from_trussel_new.dta, replace
			
